#####
## figure_e1.R
## This file creates Figure E.1.
## The input file is localauthority_level.dta. 
#####

# preamble
# Run this script in a fresh R session. The workspace is deliberately not
# cleared here, so sourcing the file never deletes objects the caller holds.
library(janitor)
library(foreign)
library(haven)
library(tidyverse)

# open
# Read the Stata file, keep only the columns used below, and sort the rows by
# territory_level, year and ethnicity.
K <- read_dta("localauthority_level.dta") %>%
  select(
    territory_level, year, ethnicity, majority,
    la_clerk, clerketh, percent_in_la
  ) %>%
  arrange(territory_level, year, ethnicity)

# create variables for clerk and majority ethnicity
# Within each territory_level / year group, copy `ethnicity` from the row
# flagged majority == 1 (resp. clerketh == 1) and spread it to every row of
# the group by filling down and then up.
K <- K %>%
  group_by(territory_level, year) %>%
  mutate(
    maj_ethnicity = ifelse(majority == 1, ethnicity, NA),
    trueclerk_ethnicity = ifelse(clerketh == 1, ethnicity, NA)
  ) %>%
  fill(maj_ethnicity, trueclerk_ethnicity, .direction = "downup") %>%
  # drop the grouping so that later steps work on a plain tibble
  ungroup()

# groups with no flagged row are still NA after the fill; give them explicit
# sentinel labels so the string comparisons below never see NA
K$maj_ethnicity[is.na(K$maj_ethnicity)] <- "NO MAJORITY"
K$trueclerk_ethnicity[is.na(K$trueclerk_ethnicity)] <- "NO CLERK"

# create categorical var for clerk_status, plus a 0/1 co-ethnic indicator
# Branches are evaluated in order, so rows where the two ethnicities are equal
# are already labelled by the first branch and need not be excluded again in
# the later ones.
K <- K %>%
  mutate(
    clerk_status = case_when(
      maj_ethnicity == trueclerk_ethnicity ~ "Co-ethnic clerk",
      trueclerk_ethnicity != "NO CLERK" & maj_ethnicity != "NO MAJORITY" ~
        "Non co-ethnic clerk",
      trueclerk_ethnicity == "NO CLERK" & maj_ethnicity != "NO MAJORITY" ~
        "Majority with no clerk",
      maj_ethnicity == "NO MAJORITY" ~ "No majority"
    ),
    # true coeth clerk
    true_coethclerk = ifelse(clerk_status == "Co-ethnic clerk", 1, 0)
  )

# collapse to LA / year
# Keep one row per unique combination of the listed columns.
K <- distinct(
  K,
  territory_level, year, la_clerk, maj_ethnicity,
  trueclerk_ethnicity, clerk_status, true_coethclerk
)

# get number of clerks in a year
# Sum of la_clerk within each year.
clerksyear <- summarise(group_by(K, year), n_clerksyear = sum(la_clerk))

# loop 1000 times
# Randomization exercise: in every simulation the observed clerk ethnicities
# are shuffled across all rows of K, and the number of rows whose shuffled
# clerk ethnicity equals the majority ethnicity is counted. rando_list holds
# one data frame per simulation.

# fix the RNG state so the simulated distribution can be reproduced exactly
set.seed(1234)
n_sims <- 1000L

# columns every simulation needs; built once, ungrouped, outside the loop
sim_base <- K %>%
  ungroup() %>%
  select(territory_level, year, maj_ethnicity, trueclerk_ethnicity)
n_rows <- nrow(sim_base)

rando_list <- lapply(seq_len(n_sims), function(i) {
  DF <- sim_base

  # scramble clerk_ethnicity across LAs (a permutation, without replacement)
  DF$rando_clerketh <- DF$trueclerk_ethnicity[sample.int(n_rows)]

  # add sim #
  DF$sim <- i

  # number of co-ethnic matches in this simulation, repeated on every row
  DF$n_clerketh <- sum(DF$maj_ethnicity == DF$rando_clerketh)

  DF
})
  
# unlist and compare
# Stack the per-simulation data frames, then reduce them to one row per
# simulation holding its co-ethnic count.
rando_df <- bind_rows(rando_list)
rando_df_collapsed <- distinct(ungroup(rando_df), sim, n_clerketh)

# observed number of rows in K classified as having a co-ethnic clerk
true_coethclerk <- summarise(
  ungroup(K),
  real_coethclerk = sum(clerk_status == "Co-ethnic clerk")
)
  
# plot the density
# Histogram of the simulated co-ethnic counts, with a vertical line at the
# observed count taken from true_coethclerk.
ggplot(rando_df_collapsed, aes(x = n_clerketh)) +
  geom_histogram(bins = 100) +
  geom_vline(
    data = true_coethclerk,
    aes(
      xintercept = real_coethclerk,
      color = "True number of co-ethnic clerks"
    ),
    size = 1.2
  ) +
  scale_y_continuous(limits = c(0, 300), breaks = seq(0, 350, by = 50)) +
  scale_x_continuous(limits = c(0, 700), breaks = seq(0, 700, by = 100)) +
  labs(
    x = "Number of co-ethnic clerks in each randomization",
    y = "Frequency",
    title = ""
  ) +
  ggthemes::theme_stata(base_size = 22, scheme = "s1color") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    plot.background = element_rect(fill = "white"),
    # the only legend entry is the vertical line; hide the legend title and
    # place the key inside the panel
    legend.title = element_blank(),
    legend.background = element_rect(color = NA),
    legend.position = c(0.67, .92),
    legend.text = element_text(size = 12)
  )







